## Setup

# clear workspace
rm(list = ls()); gc()

# load packages
library(tidyverse)
library(survey)
library(stargazer)
library(ggplot2)
library(sjPlot)
library(ggpubr)
library(estimatr)
library(interflex)
library(specr)
library(parallel)
library(MASS)
library(dplyr)
library(car)
library(effects)
library(lmtest)
library(sandwich)
library(texreg)

# set working directory to source file location
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))


# ~~~~~~~~~~


# This script takes <1 minute to run


# ~~~~~~~~~~


## Prep data

# load data, filtering out 1 observation with missing weight
setwd("../data/processed")
data <- readRDS("master_data.rds") %>% filter(!is.na(weight))

# create per-capita hospital beds variable
data$beds_per_10k <- data$beds/(data$population/10000)

# create per-capita COVID fatality variables
data$deaths_per_10k <- data$deaths/(data$population/10000)
data$deaths_30_per_10k <- data$deaths_30/(data$population/10000)

# create natural logged COVID fatality variables | make any negatives equal to 0
# these occur when locals correct previously fatality counts and thus include daily negative values
data$log_deaths <- log(data$deaths_per_10k + 1)
data$log_deaths_30 <- ifelse(data$deaths_30_per_10k >= 0, log(data$deaths_30_per_10k + 1), 0)

# recode subjective concern items 
# (i) "How concerned are you about a coronavirus epidemic in the community where you live?"
data$corona_2a <- ifelse(data$corona_2a == 4, 0, # not concerned at all
                         ifelse(data$corona_2a == 3, 1, # not very concerned
                                ifelse(data$corona_2a == 2, 2, # somewhat concerned
                                       ifelse(data$corona_2a == 1, 3, NA)))) # very concerned

# (ii) How serious a problem COVID-19 in your community?
data$cor_seriousness_community <- ifelse(data$cor_seriousness_community == 4 | data$cor_seriousness_community == 5, 0, # Not a problem | Not sure
                                         ifelse(data$cor_seriousness_community == 3, 1, # A minor problem
                                                ifelse(data$cor_seriousness_community == 2, 2, # Somewhat serious
                                                       ifelse(data$cor_seriousness_community == 1, 3, NA)))) # Very serious

# create period indicator variables to indicate four periods in which the response options labeled as under-predictions remain the same
data$period_1 <- ifelse(data$surveydate %in% c("2020-05-23", "2020-05-30", "2020-06-06", "2020-06-13"), 1, 0)
data$period_2 <- ifelse(data$surveydate %in% c("2020-08-15", "2020-08-22", "2020-08-29", "2020-09-05", "2020-09-12"), 1, 0)
data$period_3 <- ifelse(data$surveydate %in% c("2020-09-19", "2020-09-26", "2020-10-03", "2020-10-10", "2020-10-17", "2020-10-24", "2020-10-31", "2020-11-07"), 1, 0)

# create survey design object
d <- svydesign(ids = ~FIPS, weights = ~weight, data = data)


# ~~~~~~~~~~


## Generate numbers in the main paper text

# (1) number of counties (p. 1)

# generate number and save as txt file
setwd("../../output")
writeLines(paste("Number of counties:", length(unique(data$FIPS))), "pg1_no_counties.txt")

# --

# (2) difference in likelihood of under- and over-estimating by party and educational attainment (pg. 3)
writeLines(paste(paste("Difference in likelihood of underestimating by party (R-D):",
                       round(weighted.mean(data[data$party == "Republican",]$under, w = data[data$party == "Republican",]$weight, na.rm = T) - 
                               weighted.mean(data[data$party == "Democrat",]$under, w = data[data$party == "Democrat",]$weight, na.rm = T),3)),
                 paste("Difference in likelihood of overestimating by party (R-D):",
                       round(weighted.mean(data[data$party == "Republican",]$over, w = data[data$party == "Republican",]$weight, na.rm = T) - 
                               weighted.mean(data[data$party == "Democrat",]$over, w = data[data$party == "Democrat",]$weight, na.rm = T),3)),
                 paste("Difference in likelihood of underestimating by education (High school vs. Bachelors):",
                       round(weighted.mean(data[data$educ3 == "High school or less",]$under, w = data[data$educ3 == "High school or less",]$weight, na.rm = T) - 
                               weighted.mean(data[data$educ3 == "Bachelor's degree or more",]$under, w = data[data$educ3 == "Bachelor's degree or more",]$weight, na.rm = T),3)),
                 paste("Difference in likelihood of overestimating by education (High school vs. Bachelors):", 
                       round(weighted.mean(data[data$educ3 == "High school or less",]$over, w = data[data$educ3 == "High school or less",]$weight, na.rm = T) - 
                               weighted.mean(data[data$educ3 == "Bachelor's degree or more",]$over, w = data[data$educ3 == "Bachelor's degree or more",]$weight, na.rm = T),3)),
                 sep = "\n"), "pg3_misperceptions_by_party_education.txt")

# --

# (3) proportion of survey respondents who prefer national vs. local news (pg. 6)
writeLines(paste(
  paste("Proportion who prefer national news source:", sum(data$econQ2 %in% c(1,2,4))/sum(!is.na(data$econQ2))),
  paste("proportion who prefer local news source:", sum(data$econQ2 %in% c(3,5))/sum(!is.na(data$econQ2))), sep = "\n"), "pg6_news_sources.txt")

# --

# (4) correlation between objective and subjective measures of local coronavirus severity (footnote 8 p. 11)

# generate correlations
num <- data %>% 
  ungroup() %>%
  summarize(c1=cor(log_deaths, corona_2a, use = "pairwise.complete.obs"),
            c2=cor(log_deaths_30, corona_2a, use = "pairwise.complete.obs"),
            c3=cor(log_deaths, cor_seriousness_community, use = "pairwise.complete.obs"),
            c4=cor(log_deaths_30, cor_seriousness_community, use = "pairwise.complete.obs"))

# save as txt file
writeLines(paste(paste("Correlation between logged deaths and concern about epidemic in one's community:", round(num$c1, 3)),
                 paste("Correlation between logged deaths in last 30 days and concern about epidemic in one's community", round(num$c2, 3)),
                 paste("Correlation between logged deaths and perceived seriousness of problem in one's community", round(num$c3, 3)),
                 paste("Correlation between logged deaths in last 30 days and perceived seriousness of problem in one's community", round(num$c4, 3)), 
                 sep = "\n"), "footnote_8.txt"); rm(num)

# --

# (5) distribution of weights (footnote 11 pg. 13)
writeLines(paste(paste("Minimum weight:", round(min(data$weight), 2)),
                 paste("Maximum weight:", round(max(data$weight), 2)),
                 paste("Mean weight:", round(mean(data$weight), 3)),
                 paste("Standard deviation weight:", round(sd(data$weight), 2)), sep = "\n"), "footnote_11.txt")


# ~~~~~~~~~~


## Generate tables

# Table 2

# under ~ logged cumulative deaths
fit1 <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                 log_deaths*covid_nonpartisan_estimate + 
                 unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                 age + female + educ3 + race + inc3 + pid7 + ideo5, 
               design = d, family = quasibinomial(link = "logit"), rescale = T)

# under ~ logged deaths in last 30 days
fit2 <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                 log_deaths_30*covid_nonpartisan_estimate + 
                 unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                 age + female + educ3 + race + inc3 + pid7 + ideo5, 
               design = d, family = quasibinomial(link = "logit"), rescale = T)

# over ~ logged cumulative deaths
fit3 <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                 log_deaths*covid_nonpartisan_estimate + 
                 unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                 age + female + educ3 + race + inc3 + pid7 + ideo5, 
               design = d, family = quasibinomial(link = "logit"), rescale = T)

# over ~ logged deaths in last 30 days
fit4 <- svyglm(over ~ week + period_1 + period_2 + period_3 + 
                 log_deaths_30*covid_nonpartisan_estimate + 
                 unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                 age + female + educ3 + race + inc3 + pid7 + ideo5, 
               design = d, family = quasibinomial(link = "logit"), rescale = T)

# extract values
sum_fit1 <- as.data.frame(summary(fit1)$coefficient) %>% mutate(coef = row.names(.)) %>% filter(coef == "log_deaths:covid_nonpartisan_estimate") %>% dplyr::select(-`t value`)
sum_fit2 <- as.data.frame(summary(fit2)$coefficient) %>% mutate(coef = row.names(.)) %>% filter(coef == "log_deaths_30:covid_nonpartisan_estimate") %>% dplyr::select(-`t value`)
sum_fit3 <- as.data.frame(summary(fit3)$coefficient) %>% mutate(coef = row.names(.)) %>% filter(coef == "log_deaths:covid_nonpartisan_estimate") %>% dplyr::select(-`t value`)
sum_fit4 <- as.data.frame(summary(fit4)$coefficient) %>% mutate(coef = row.names(.)) %>% filter(coef == "log_deaths_30:covid_nonpartisan_estimate") %>% dplyr::select(-`t value`)
rownames(sum_fit1) <- rownames(sum_fit2) <- rownames(sum_fit3) <- rownames(sum_fit4) <- NULL

# make table
tab <- bind_rows(sum_fit1, sum_fit2, sum_fit3, sum_fit4) %>%
  mutate(Estimate = round(Estimate, 3),
         `Std. Error` = round(`Std. Error`, 3),
         `Pr(>|t|)` = round(`Pr(>|t|)`, 3),
         coef = if_else(coef == "log_deaths:covid_nonpartisan_estimate", "cumulative deaths", "deaths in last 30 days"),
         DV = c(rep("Underestimation", 2), rep("Overestimation", 2)),
         Observations = c(nobs(fit1), nobs(fit2), nobs(fit3), nobs(fit4))); rm(sum_fit1, sum_fit2, sum_fit3, sum_fit4)

# save
write.csv(tab, "Table 2.csv", row.names = F)

# save full regression table for Appendix A.7 (Table A.4)
writeLines(stargazer(fit1, fit2, fit3, fit4, no.space = T), "Table A.4.txt")

# --

# Table 3

# convert outcome variable to factor and drop unused response options
data$covid_deaths <- ifelse(data$covid_deaths %in% c(7, 10, 11), NA, data$covid_deaths)

# filter out data with missingness
data_complete <- data %>%
  filter(!is.na(inc3)) %>%
  filter(if_all(c(covid_deaths, median_household_income, rep_share), is.finite)) %>%
  droplevels()

# change variable to categorical labels and factorize
data_complete <- data_complete %>%
  mutate(covid_deaths = case_when(covid_deaths == 1 ~ "(1) Fewer than 100,000",
                                  covid_deaths == 2 ~ "(2) 100,000-150,000",
                                  covid_deaths == 3 ~ "(3) 150,000-200,000",
                                  covid_deaths == 4 ~ "(4) 200,000-250,000",
                                  covid_deaths == 5 ~ "(5) 250,000-1 million",
                                  covid_deaths == 6 ~ "(6) More than 1 million"))

data_complete$covid_deaths <- as.factor(data_complete$covid_deaths)

# fit ordinal logistic regression
fit <- polr(covid_deaths ~ week + period_1 + period_2 + period_3 + 
              log_deaths*covid_nonpartisan_estimate + 
              unemployment + beds_per_10k + rural_urban_continuum + bachelors + rep_share + black + hispanic + 
              age + female + educ3 + race + inc3 + pid7 + ideo5,
            data = data_complete, weights = weight, Hess = T)

# correct for clustered standard errors and save as CSV
write.csv(coeftest(fit, vcov=vcovCL(fit, factor(data_complete$FIPS))), "Table 3.csv", row.names = row.names(coeftest(fit, vcov=vcovCL(fit, factor(data_complete$FIPS)))))

# generate and save Appendix Figure A.13
pdf("Figure A.13.pdf", width = 8, height = 10, bg = "white") # open a graphics device
plot(Effect(focal.predictors = c("log_deaths", "covid_nonpartisan_estimate"), fit,
            xlevels = list(covid_nonpartisan_estimate = c(min(data$covid_nonpartisan_estimate), max(data$covid_nonpartisan_estimate)))),
     xlab = "Logged cumulative per-capita COVID-19 deaths",
     ylab = "Predicted probability",
     main = "")
dev.off() # close graphics device


# ~~~~~~~~~~


## Generate figures

# Figure 3

# have to manually add min and max values to code to make marginal effects plots
min(data$covid_nonpartisan_estimate) # 0.06260057
max(data$covid_nonpartisan_estimate) # 0.2431676

p1 <- plot_model(fit1, type = "pred", terms = c("log_deaths", "covid_nonpartisan_estimate [0.06260057, 0.2431676]")) + 
  theme_bw() + 
  labs(title = "", x = "Logged cumulative per-capita COVID-19 deaths", y = "Underestimating") + 
  scale_color_manual(values = c("forestgreen", "purple"), labels = c("min", "max")) + 
  scale_fill_manual(values = c("forestgreen", "purple"), labels = c("min", "max")) + 
  scale_y_continuous(labels = scales::percent, limits = c(0.05, 0.46)) + 
  labs(color = "Media salience") + 
  theme(legend.position = "bottom") + 
  theme(text=element_text(size=24))

p2 <- plot_model(fit2, type = "pred", terms = c("log_deaths_30", "covid_nonpartisan_estimate [0.06260057, 0.2431676]")) + 
  theme_bw() + 
  labs(title = "", x = "Logged per-capita COVID-19 deaths in last 30 days", y = "") + 
  scale_color_manual(values = c("forestgreen", "purple"), labels = c("min", "max")) + 
  scale_fill_manual(values = c("forestgreen", "purple"), labels = c("min", "max")) + 
  scale_y_continuous(labels = scales::percent, limits = c(0.05, 0.46)) + 
  labs(color = "Media salience") + 
  theme(legend.position = "bottom") + 
  theme(text=element_text(size=24))

p <- ggarrange(p1, p2, ncol = 2, nrow = 1, common.legend = T, legend = "bottom"); rm(p1, p2)

ggsave(filename = "Figure 3.png", plot = p, device = "png", width = 18, height = 11, units = "in", bg = "white")

# --

# Figure 4
fit1t <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                  log_deaths*covid_nonpartisan_estimate*party + 
                  unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                  age + female + educ3 + race + inc3 + ideo5, 
                design = d, family = quasibinomial(link = "logit"), rescale = T)

fit2t <- svyglm(under ~ week + period_1 + period_2 + period_3 + 
                  log_deaths_30*covid_nonpartisan_estimate*party + 
                  unemployment + beds_per_10k + rural_urban_continuum + bachelors + median_household_income + rep_share + black + hispanic + 
                  age + female + educ3 + race + inc3 + ideo5, 
                design = d, family = quasibinomial(link = "logit"), rescale = T)

p1 <- plot_model(fit1t, type = "pred", terms = c("log_deaths", "covid_nonpartisan_estimate [0.06260057, 0.2431676]", "party")) + 
  theme_bw() + 
  labs(x = "Logged cumulative per-capita COVID-19 deaths", y = "Underestimating", title = "", color = "Media salience") + 
  scale_color_manual(values = c("forestgreen", "purple"), labels = c("min", "max")) + 
  scale_fill_manual(values = c("forestgreen", "purple"), labels = c("min", "max")) +
  theme(legend.position = "bottom",
        text=element_text(size=24))

p2 <- plot_model(fit2t, type = "pred", terms = c("log_deaths_30", "covid_nonpartisan_estimate [0.06260057, 0.2431676]", "party")) + 
  theme_bw() + 
  labs(x = "Logged per-capita COVID-19 deaths in last 30 days", y = "Underestimating", title = "", color = "Media salience") +
  scale_color_manual(values = c("forestgreen", "purple"), labels = c("min", "max")) + 
  scale_fill_manual(values = c("forestgreen", "purple"), labels = c("min", "max")) +
  theme(legend.position = "bottom",
        text=element_text(size=24))

p <- ggarrange(p1, p2, ncol = 1, nrow = 2, common.legend = T, legend = "bottom"); rm(p1, p2)

ggsave(filename = "Figure 4.png", plot = p, device = "png", width = 18, height = 15, units = "in", bg = "white")

# --

# Figure 5
p1 <- plot_model(fit3, type = "pred", terms = c("log_deaths", "covid_nonpartisan_estimate [0.06260057, 0.2431676]")) + 
  theme_bw() + 
  labs(title = "", x = "Logged cumulative per-capita COVID-19 deaths", y = "Overestimating") + 
  scale_color_manual(values = c("forestgreen", "purple"), labels = c("min", "max")) + 
  scale_fill_manual(values = c("forestgreen", "purple"), labels = c("min", "max")) + 
  scale_y_continuous(labels = scales::percent, limits = c(0, 0.47)) + 
  labs(color = "Media salience") + 
  theme(legend.position = "bottom") + 
  theme(text=element_text(size=24))

p2 <- plot_model(fit4, type = "pred", terms = c("log_deaths_30", "covid_nonpartisan_estimate [0.06260057, 0.2431676]")) + 
  theme_bw() + 
  labs(title = "", x = "Logged per-capita COVID-19 deaths in last 30 days", y = "") + 
  scale_color_manual(values = c("forestgreen", "purple"), labels = c("min", "max")) + 
  scale_fill_manual(values = c("forestgreen", "purple"), labels = c("min", "max")) + 
  scale_y_continuous(labels = scales::percent, limits = c(0, 0.47)) + 
  labs(color = "Media salience") + 
  theme(legend.position = "bottom") + 
  theme(text=element_text(size=24))

p <- ggarrange(p1, p2, ncol = 2, nrow = 1, common.legend = T, legend = "bottom"); rm(p1, p2)

ggsave(filename = "Figure 5.png", plot = p, device = "png", width = 18, height = 11, units = "in", bg = "white")

